#importing all the libraries
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer
import re
# Tutorial about Python regular expressions: https://pymotw.com/2/re/
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle
from tqdm import tqdm
import os
from plotly import plotly
import plotly.offline as offline
import plotly.graph_objs as go
offline.init_notebook_mode()
from collections import Counter
# Load the DonorsChoose project applications and the per-project resource rows.
# 'train_data.csv' has one row per project; 'resources.csv' has one row per
# requested item (possibly many rows per project id).
project_data = pd.read_csv('train_data.csv')
resource_data = pd.read_csv('resources.csv')
project_data.head(3)
# Class balance of the target: approved vs rejected counts.
# The dataset is imbalanced (far more approved than rejected projects).
project_data['project_is_approved'].value_counts()
# https://stackoverflow.com/questions/22407798/how-to-reset-a-dataframes-indexes-for-all-groups-in-one-step
# Collapse resources.csv to one row per project id: total price and quantity.
price_data = resource_data.groupby('id').agg({'price':'sum', 'quantity':'sum'}).reset_index()
# Left-join the aggregated cost columns onto the project table on 'id'.
project_data = pd.merge(project_data, price_data, on='id', how='left')
# NOTE(review): DataFrame.info() prints to stdout and returns None, so this
# line prints "Summary of Data:  None" after the info table — harmless.
print("Summary of Data: ", project_data.info())
# Build the desired final column order now: same order as the current columns,
# but with 'project_submitted_datetime' renamed to 'Date'. Must be computed
# BEFORE the column is dropped below.
cols = ['Date' if x=='project_submitted_datetime' else x for x in list(project_data.columns)]
# Parse the submission timestamp, drop the raw string column, and sort the
# whole frame chronologically (needed for the time-based split later).
project_data['Date'] = pd.to_datetime(project_data['project_submitted_datetime'])
project_data.drop('project_submitted_datetime', axis=1, inplace=True)
project_data.sort_values(by=['Date'], inplace=True)
# Reorder so 'Date' sits where the datetime column originally was.
project_data = project_data[cols]
project_data.head(2)
catogories = list(project_data['project_subject_categories'].values)
# remove special characters from list of strings python: https://stackoverflow.com/a/47301924/4084039

def _normalise_category(raw):
    """Collapse one raw category string, e.g.
    "Math & Science, Warmth, Care & Hunger" -> "Math_Science Warmth Care_Hunger".
    """
    pieces = []
    for piece in raw.split(','):
        # Drop the standalone word 'The', then squeeze out every space so each
        # comma-separated category becomes a single token.
        if 'The' in piece.split():
            piece = piece.replace('The', '')
        pieces.append(piece.replace(' ', '').strip())
    # '&' -> '_' so e.g. "Math&Science" survives later tokenisation as one feature.
    return ' '.join(pieces).replace('&', '_').strip()

cat_list = [_normalise_category(entry) for entry in catogories]
project_data['clean_categories'] = cat_list
project_data.drop(['project_subject_categories'], axis=1, inplace=True)

from collections import Counter
# Frequency of each cleaned category token across the whole dataset.
my_counter = Counter()
for row in project_data['clean_categories'].values:
    my_counter.update(row.split())
cat_dict = dict(my_counter)
# category -> count, ascending by count.
sorted_cat_dict = dict(sorted(cat_dict.items(), key=lambda kv: kv[1]))
sub_catogories = list(project_data['project_subject_subcategories'].values)
# remove special characters from list of strings python: https://stackoverflow.com/a/47301924/4084039

def _normalise_subcategory(raw):
    """Same normalisation as for categories: split on ',', drop the word 'The',
    remove spaces, join the pieces and turn '&' into '_'.
    """
    pieces = []
    for piece in raw.split(','):
        if 'The' in piece.split():
            piece = piece.replace('The', '')
        pieces.append(piece.replace(' ', '').strip())
    return ' '.join(pieces).replace('&', '_').strip()

sub_cat_list = [_normalise_subcategory(entry) for entry in sub_catogories]
project_data['clean_subcategories'] = sub_cat_list
project_data.drop(['project_subject_subcategories'], axis=1, inplace=True)

# count of all the words in corpus python: https://stackoverflow.com/a/22898595/4084039
# Frequency of each cleaned subcategory token, ascending by count.
my_counter = Counter()
for row in project_data['clean_subcategories'].values:
    my_counter.update(row.split())
sub_cat_dict = dict(my_counter)
sorted_sub_cat_dict = dict(sorted(sub_cat_dict.items(), key=lambda kv: kv[1]))
#Citation:
#url: https://stackoverflow.com/questions/14247586/python-pandas-how-to-select-rows-with-one-or-more-nulls-from-a-dataframe-without
# Inspect the rows where teacher_prefix is missing.
project_data[project_data['teacher_prefix'].isnull()]
# Drop every row with a missing teacher_prefix. The original code hard-coded
# the row labels (30368, 57654, 7820), which silently breaks (or drops the
# wrong rows) if the input CSV ever changes; dropna targets the actual NaNs.
project_data.dropna(subset=['teacher_prefix'], inplace=True)
#Pre processing of essay.
# Concatenate the four essay parts into one text column.
# FIXES over the original `.map(str)` chain:
#  - fillna('') stops the literal string "nan" from being injected where
#    project_essay_3/4 are missing (they are NaN for most rows);
#  - a space separator keeps the last word of one part from fusing with the
#    first word of the next into a bogus token.
project_data["essay"] = (
    project_data["project_essay_1"].fillna('').map(str) + " " +
    project_data["project_essay_2"].fillna('').map(str) + " " +
    project_data["project_essay_3"].fillna('').map(str) + " " +
    project_data["project_essay_4"].fillna('').map(str)
)
# Print a few essays at scattered positions to sanity-check the merge.
print(project_data['essay'].values[0])
print("="*50)
print(project_data['essay'].values[150])
print("="*50)
print(project_data['essay'].values[1000])
print("="*50)
print(project_data['essay'].values[20000])
print("="*50)
# https://stackoverflow.com/a/47091490/4084039
import re

# (pattern, replacement) pairs, applied in order. The specific contractions
# come first so e.g. "won't" becomes "will not" before the generic "n't"
# rule could turn it into "wo not".
_CONTRACTION_RULES = [
    (r"won't", "will not"),
    (r"can\'t", "can not"),
    (r"n\'t", " not"),
    (r"\'re", " are"),
    (r"\'s", " is"),
    (r"\'d", " would"),
    (r"\'ll", " will"),
    (r"\'t", " not"),
    (r"\'ve", " have"),
    (r"\'m", " am"),
]

def decontracted(phrase):
    """Expand common English contractions in *phrase* and return the result."""
    for pattern, replacement in _CONTRACTION_RULES:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase
# Walk one sample essay through each cleaning stage, printing after each step.
sent = decontracted(project_data['essay'].values[20000])
print(sent)
print("="*50)
# \r \n \t remove from string python: http://texthandler.com/info/remove-line-breaks-python/
# Strip the escaped line-break artefacts left over from the CSV export.
for junk in ('\\r', '\\"', '\\n'):
    sent = sent.replace(junk, ' ')
print(sent)
#remove special character: https://stackoverflow.com/a/5843547/4084039
# Collapse every run of non-alphanumeric characters into a single space.
sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
print(sent)
# https://gist.github.com/sebleier/554280
# English stopword list with the negations 'no', 'nor', 'not' removed, since
# they carry signal for approval/sentiment. Stored as a set (was a list) so
# the `word not in stopwords` tests in the per-word preprocessing loops below
# run in O(1) instead of scanning ~170 entries for every word of every essay.
stopwords = {'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",
             "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',
             'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',
             'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those',
             'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
             'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
             'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',
             'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',
             'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
             'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
             's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're',
             've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',
             "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',
             "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't",
             'won', "won't", 'wouldn', "wouldn't"}
# Apply the full cleaning pipeline (decontract, strip escapes, strip
# punctuation, drop stopwords, lowercase) to every essay.
from tqdm import tqdm

preprocessed_essays = []
# tqdm just renders a progress bar over the corpus.
for raw_essay in tqdm(project_data['essay'].values):
    cleaned = decontracted(raw_essay)
    for junk in ('\\r', '\\"', '\\n'):
        cleaned = cleaned.replace(junk, ' ')
    cleaned = re.sub('[^A-Za-z0-9]+', ' ', cleaned)
    # https://gist.github.com/sebleier/554280
    cleaned = ' '.join(tok for tok in cleaned.split() if tok.lower() not in stopwords)
    preprocessed_essays.append(cleaned.lower().strip())
# Preprocess project titles with exactly the same pipeline as the essays.
from tqdm import tqdm

preprocessed_title = []
# tqdm just renders a progress bar over the corpus.
for raw_title in tqdm(project_data['project_title'].values):
    cleaned = decontracted(raw_title)
    for junk in ('\\r', '\\"', '\\n'):
        cleaned = cleaned.replace(junk, ' ')
    cleaned = re.sub('[^A-Za-z0-9]+', ' ', cleaned)
    # https://gist.github.com/sebleier/554280
    cleaned = ' '.join(tok for tok in cleaned.split() if tok.lower() not in stopwords)
    preprocessed_title.append(cleaned.lower().strip())

# Swap the raw text columns for their cleaned counterparts.
project_data['project_title'] = preprocessed_title
project_data['essay'] = preprocessed_essays
# The four essay parts were merged into the single 'essay' column above, so
# the originals are redundant and dropped here.
#Citation: pandas drop a column
#url: https://stackoverflow.com/questions/13411544/delete-column-from-pandas-dataframe-by-column-name
columns = ['project_essay_1', 'project_essay_2', 'project_essay_3', 'project_essay_4']
project_data.drop(columns, axis=1, inplace=True)
# Separate the target vector from the features: y is the approval label,
# X is everything else (the label column is removed so it cannot leak).
y = project_data['project_is_approved'].values
project_data.drop(['project_is_approved'], axis=1, inplace=True)
X = project_data
#As you can see we have four categories.
#While vectorizing this we get 'grades' separately. That is,
#Prek-2, 3-5, 6-8, 9-12, Grades -> We get 5 categories which is wrong. Hence rectifying it
X['project_grade_category'].value_counts()
#Citation
#url: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.replace.html
# Hyphenate each grade band so it stays a single token when vectorized.
# One dict-based replace assigned back to the column replaces the original
# four chained `inplace=True` calls on a Series reference — those operate on
# what may be a view of X and trigger SettingWithCopyWarning (and are
# deprecated/no-op in newer pandas).
X['project_grade_category'] = X['project_grade_category'].replace({
    'Grades PreK-2': 'Grades-PreK-2',
    'Grades 3-5': 'Grades-3-5',
    'Grades 6-8': 'Grades-6-8',
    'Grades 9-12': 'Grades-9-12',
})
# Quick look at the rewritten grade labels.
X['project_grade_category'].head()
# Word-count features (SET-5): number of whitespace-separated tokens in the
# cleaned title and essay. The .str accessor chain is NaN-tolerant (a missing
# string yields NaN rather than raising).
#Citation: calculate number of words in text dataframe
#url: https://stackoverflow.com/questions/49984905/count-number-of-words-per-row
X['totalwords_title'] = X['project_title'].str.split().str.len()
# Same word count for the merged essay text.
#Citation: calculate number of words in text dataframe
#url: https://stackoverflow.com/questions/49984905/count-number-of-words-per-row
X['totalwords_essay'] = X['essay'].str.split().str.len()
#Citation: store sentiment score in dataframe
#https://stackoverflow.com/questions/46764674/sentiment-analysis-on-dataframe
import warnings
warnings.filterwarnings('ignore')
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# One VADER polarity dict per essay, e.g.
# {'neg': 0.0, 'neu': 0.753, 'pos': 0.247, 'compound': 0.93};
# all four values become features.
sid = SentimentIntensityAnalyzer()
scores = X['essay'].apply(lambda text: sid.polarity_scores(text))
# Build the score DataFrame directly from the Series of dicts, explicitly
# keeping X's index so the column assignments below align row-for-row.
# This replaces the original to_dict() -> from_dict() -> transpose round-trip,
# which built the same frame in three extra steps.
final2 = pd.DataFrame(scores.tolist(), index=X.index)
final2.head()
# Attach the four sentiment features to the main DataFrame.
X['Compound Score'] = final2['compound']
X['Negative Score'] = final2['neg']
X['Neutral Score'] = final2['neu']
X['Positive Score'] = final2['pos']
#Final dataframe
X.head(3)
#final shape
X.shape
# Split into train/CV/test BEFORE any vectorization or feature fitting —
# fitting on the full data first would leak test statistics into training.
from sklearn.model_selection import train_test_split
# shuffle=False keeps the chronological order (the frame was sorted by Date
# above), giving a time-based split: train on the past, evaluate on the future.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=False)
# Carve a cross-validation slice out of the training portion, again in time order.
X_train, X_cv, y_train, y_cv = train_test_split(X_train, y_train, test_size=0.33, shuffle=False)
%%time
#BoW for essay.
print(X_train.shape, y_train.shape)
print(X_cv.shape, y_cv.shape)
print(X_test.shape, y_test.shape)
print("="*100)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_essay = CountVectorizer(min_df=10)
vectorizer_essay.fit(X_train['essay'].values) # fit has to happen only on train data
# we use the fitted CountVectorizer to convert the text to vector
X_train_essay_bow = vectorizer_essay.transform(X_train['essay'].values)
X_cv_essay_bow = vectorizer_essay.transform(X_cv['essay'].values)
X_test_essay_bow = vectorizer_essay.transform(X_test['essay'].values)
print("After vectorizations")
print(X_train_essay_bow.shape, y_train.shape)
print(X_cv_essay_bow.shape, y_cv.shape)
print(X_test_essay_bow.shape, y_test.shape)
%%time
#BoW for project-title.
print(X_train.shape, y_train.shape)
print(X_cv.shape, y_cv.shape)
print(X_test.shape, y_test.shape)
print("="*100)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_title = CountVectorizer(min_df=10)
vectorizer_title.fit(X_train['project_title'].values) # fit has to happen only on train data
# we use the fitted CountVectorizer to convert the text to vector
X_train_title_bow = vectorizer_title.transform(X_train['project_title'].values)
X_cv_title_bow = vectorizer_title.transform(X_cv['project_title'].values)
X_test_title_bow = vectorizer_title.transform(X_test['project_title'].values)
print("After vectorizations")
print(X_train_title_bow.shape, y_train.shape)
print(X_cv_title_bow.shape, y_cv.shape)
print(X_test_title_bow.shape, y_test.shape)
# Bag-of-words encoding of the project resource summary.
for split_X, split_y in ((X_train, y_train), (X_cv, y_cv), (X_test, y_test)):
    print(split_X.shape, split_y.shape)
print("="*100)
from sklearn.feature_extraction.text import CountVectorizer

# Vocabulary is fitted on the training summaries only, then reused for CV/test.
vectorizer_resum = CountVectorizer(min_df=10)
vectorizer_resum.fit(X_train['project_resource_summary'].values)
X_train_resum_bow, X_cv_resum_bow, X_test_resum_bow = (
    vectorizer_resum.transform(part['project_resource_summary'].values)
    for part in (X_train, X_cv, X_test))
print("After vectorizations")
print(X_train_resum_bow.shape, y_train.shape)
print(X_cv_resum_bow.shape, y_cv.shape)
print(X_test_resum_bow.shape, y_test.shape)
# TF-IDF encoding of the essay text.
from sklearn.feature_extraction.text import TfidfVectorizer

# idf statistics come from the training essays only; the same fitted
# vectorizer then encodes all three splits.
vectorizer_essay_tfidf = TfidfVectorizer(min_df=10)
vectorizer_essay_tfidf.fit(X_train['essay'].values)
X_train_essay_tfidf, X_cv_essay_tfidf, X_test_essay_tfidf = (
    vectorizer_essay_tfidf.transform(part['essay'].values)
    for part in (X_train, X_cv, X_test))
print("After vectorizations")
print(X_train_essay_tfidf.shape, y_train.shape)
print(X_cv_essay_tfidf.shape, y_cv.shape)
print(X_test_essay_tfidf.shape, y_test.shape)
#TF-IDF Encoding on project title
from sklearn.feature_extraction.text import TfidfVectorizer
# BUG FIX: this cell previously instantiated CountVectorizer, so the
# "*_title_tfidf" matrices were actually raw counts, not TF-IDF weights.
vectorizer_title_tfidf = TfidfVectorizer(min_df=10)
vectorizer_title_tfidf.fit(X_train['project_title'].values)  # fit has to happen only on train data
# Encode all three splits with the vectorizer fitted on train.
X_train_title_tfidf = vectorizer_title_tfidf.transform(X_train['project_title'].values)
X_cv_title_tfidf = vectorizer_title_tfidf.transform(X_cv['project_title'].values)
X_test_title_tfidf = vectorizer_title_tfidf.transform(X_test['project_title'].values)
print("After vectorizations")
print(X_train_title_tfidf.shape, y_train.shape)
print(X_cv_title_tfidf.shape, y_cv.shape)
print(X_test_title_tfidf.shape, y_test.shape)
#TF-IDF Encoding on project resource summary
from sklearn.feature_extraction.text import TfidfVectorizer
# BUG FIX (two issues in the original cell):
#  1. it instantiated CountVectorizer while claiming TF-IDF;
#  2. the three transform calls used vectorizer_title_tfidf — the *title*
#     vocabulary — instead of the resource-summary vectorizer fitted here,
#     so the features were built from the wrong text column's vocabulary.
vectorizer_resum_tfidf = TfidfVectorizer(min_df=10)
vectorizer_resum_tfidf.fit(X_train['project_resource_summary'].values)  # fit has to happen only on train data
X_train_resum_tfidf = vectorizer_resum_tfidf.transform(X_train['project_resource_summary'].values)
X_cv_resum_tfidf = vectorizer_resum_tfidf.transform(X_cv['project_resource_summary'].values)
X_test_resum_tfidf = vectorizer_resum_tfidf.transform(X_test['project_resource_summary'].values)
print("After vectorizations")
print(X_train_resum_tfidf.shape, y_train.shape)
print(X_cv_resum_tfidf.shape, y_cv.shape)
print(X_test_resum_tfidf.shape, y_test.shape)
#ON PRE-PROCESSED ESSAY
# storing variables into pickle files python: http://www.jessicayung.com/how-to-use-pickle-to-save-and-load-variables-in-python/
# make sure you have the glove_vectors file
# Load the pre-trained GloVe embeddings — a pickled dict mapping word ->
# vector (the loops below assume 300 dimensions). glove_words caches the
# vocabulary as a set for O(1) membership tests.
with open('glove_vectors', 'rb') as f:
    model = pickle.load(f)
    glove_words = set(model.keys())
# Average-GloVe representation of the essays: each document becomes the mean
# of the 300-d vectors of its in-vocabulary words (zero vector if none match).

def _mean_glove_essay(texts):
    """Return a list with one 300-d mean GloVe vector per text in *texts*."""
    vectors = []
    for doc in tqdm(texts):
        acc = np.zeros(300)
        hits = 0  # number of tokens found in the GloVe vocabulary
        for token in doc.split():
            if token in glove_words:
                acc += model[token]
                hits += 1
        if hits != 0:
            acc /= hits
        vectors.append(acc)
    return vectors

# Train split.
train_w2v_vectors_essays = _mean_glove_essay(X_train['essay'].values)
print("Train Vector for essay")
print(len(train_w2v_vectors_essays))
print(len(train_w2v_vectors_essays[0]))

# Test split.
test_w2v_vectors_essays = _mean_glove_essay(X_test['essay'].values)
print("Test Vector for essay")
print(len(test_w2v_vectors_essays))
print(len(test_w2v_vectors_essays[0]))

# CV split.
cv_w2v_vectors_essays = _mean_glove_essay(X_cv['essay'].values)
print("CV vector for essay")
print(len(cv_w2v_vectors_essays))
print(len(cv_w2v_vectors_essays[0]))

# Changing the lists (Train, Test, CV) to numpy arrays
train_w2v_vectors_essays = np.array(train_w2v_vectors_essays)
test_w2v_vectors_essays = np.array(test_w2v_vectors_essays)
cv_w2v_vectors_essays = np.array(cv_w2v_vectors_essays)
# Same average-GloVe encoding, now for the preprocessed project titles.

def _mean_glove_title(texts):
    """Return a list with one 300-d mean GloVe vector per text in *texts*."""
    vectors = []
    for doc in tqdm(texts):
        acc = np.zeros(300)
        hits = 0  # number of tokens found in the GloVe vocabulary
        for token in doc.split():
            if token in glove_words:
                acc += model[token]
                hits += 1
        if hits != 0:
            acc /= hits
        vectors.append(acc)
    return vectors

# Train split.
w2v_train_vectors_titles = _mean_glove_title(X_train['project_title'].values)
print("Train Vector for project title")
print(len(w2v_train_vectors_titles))
print(len(w2v_train_vectors_titles[0]))

# Test split.
w2v_test_vectors_titles = _mean_glove_title(X_test['project_title'].values)
print("Test Vector for project title")
print(len(w2v_test_vectors_titles))
print(len(w2v_test_vectors_titles[0]))

# CV split.
w2v_cv_vectors_titles = _mean_glove_title(X_cv['project_title'].values)
print("CV Vector for project title")
print(len(w2v_cv_vectors_titles))
print(len(w2v_cv_vectors_titles[0]))

# Changing the lists (Train, Test, CV) to numpy arrays
w2v_train_vectors_titles = np.array(w2v_train_vectors_titles)
w2v_test_vectors_titles = np.array(w2v_test_vectors_titles)
w2v_cv_vectors_titles = np.array(w2v_cv_vectors_titles)
# Same average-GloVe encoding, now for the project resource summaries.

def _mean_glove_resum(texts):
    """Return a list with one 300-d mean GloVe vector per text in *texts*."""
    vectors = []
    for doc in tqdm(texts):
        acc = np.zeros(300)
        hits = 0  # number of tokens found in the GloVe vocabulary
        for token in doc.split():
            if token in glove_words:
                acc += model[token]
                hits += 1
        if hits != 0:
            acc /= hits
        vectors.append(acc)
    return vectors

# Train split.
w2v_train_vectors_resum = _mean_glove_resum(X_train['project_resource_summary'].values)
print("Train Vector for project resource summary")
print(len(w2v_train_vectors_resum))
print(len(w2v_train_vectors_resum[0]))

# Test split.
w2v_test_vectors_resum = _mean_glove_resum(X_test['project_resource_summary'].values)
print("Test Vector for project resource summary")
print(len(w2v_test_vectors_resum))
print(len(w2v_test_vectors_resum[0]))

# CV split.
w2v_cv_vectors_resum = _mean_glove_resum(X_cv['project_resource_summary'].values)
print("CV Vector for project resource summary")
print(len(w2v_cv_vectors_resum))
print(len(w2v_cv_vectors_resum[0]))

# Changing the lists (Train, Test, CV) to numpy arrays
w2v_train_vectors_resum = np.array(w2v_train_vectors_resum)
w2v_test_vectors_resum = np.array(w2v_test_vectors_resum)
w2v_cv_vectors_resum = np.array(w2v_cv_vectors_resum)
# Fit an unrestricted TF-IDF model on the training essays; we only need its
# idf statistics and vocabulary for the tf-idf-weighted GloVe vectors below.
tfidf_model_essay = TfidfVectorizer()
tfidf_model_essay.fit(X_train['essay'].values)
# word -> idf lookup, plus the vocabulary as a set for O(1) membership tests.
dictionary = {word: idf for word, idf in
              zip(tfidf_model_essay.get_feature_names(), tfidf_model_essay.idf_)}
tfidf_words = set(tfidf_model_essay.get_feature_names())
#ON PRE-PROCESSED ESSAY
# storing variables into pickle files python: http://www.jessicayung.com/how-to-use-pickle-to-save-and-load-variables-in-python/
# make sure you have the glove_vectors file
# NOTE(review): this reloads the same GloVe pickle already loaded earlier —
# redundant but harmless; kept so this cell can run standalone in a notebook.
with open('glove_vectors', 'rb') as f:
    model = pickle.load(f)
    glove_words = set(model.keys())
# TF-IDF-weighted GloVe vectors for the essays: each document is the
# tf-idf-weighted average of its word vectors, normalised by the total weight.

def _tfidf_glove_essay(texts):
    """Return one 300-d tf-idf-weighted mean GloVe vector per text.

    BUG FIX vs the original loops: tf was computed with sentence.count(word),
    which counts *substring* occurrences (e.g. 'art' matched inside 'part'),
    inflating term frequencies. Counter over the token list counts whole
    tokens. Tokenisation and length are also hoisted out of the inner loop
    (the original recomputed sentence.split() per word — O(n^2)).
    """
    out = []
    for sentence in tqdm(texts):
        tokens = sentence.split()
        counts = Counter(tokens)          # exact whole-token frequencies
        total = len(tokens)
        vector = np.zeros(300)
        weight_sum = 0.0                  # sum of tf-idf weights actually used
        for word in tokens:               # duplicates contribute once per occurrence, as before
            if (word in glove_words) and (word in tfidf_words):
                tf_idf = dictionary[word] * (counts[word] / total)
                vector += model[word] * tf_idf
                weight_sum += tf_idf
        if weight_sum != 0:
            vector /= weight_sum
        out.append(vector)
    return out

train_tfidf_w2v_essay = _tfidf_glove_essay(X_train['essay'])
print(len(train_tfidf_w2v_essay))
print(len(train_tfidf_w2v_essay[0]))

cv_tfidf_w2v_essay = _tfidf_glove_essay(X_cv['essay'])
print(len(cv_tfidf_w2v_essay))
print(len(cv_tfidf_w2v_essay[0]))

test_tfidf_w2v_essay = _tfidf_glove_essay(X_test['essay'])
print(len(test_tfidf_w2v_essay))
print(len(test_tfidf_w2v_essay[0]))

# Changing list to numpy arrays
train_tfidf_w2v_essay = np.array(train_tfidf_w2v_essay)
test_tfidf_w2v_essay = np.array(test_tfidf_w2v_essay)
cv_tfidf_w2v_essay = np.array(cv_tfidf_w2v_essay)
# TF-IDF-weighted GloVe vectors for the project titles.
tfidf_model_title = TfidfVectorizer()
tfidf_model_title.fit(X_train['project_title'].values)
# word -> idf lookup and the fitted vocabulary, rebuilt for the title corpus.
dictionary = dict(zip(tfidf_model_title.get_feature_names(), list(tfidf_model_title.idf_)))
tfidf_words = set(tfidf_model_title.get_feature_names())

def _tfidf_glove_title(texts):
    """Return one 300-d tf-idf-weighted mean GloVe vector per text.

    BUG FIX vs the original loops: tf was computed with sentence.count(word),
    which counts substring occurrences, inflating term frequencies; Counter
    over the token list counts whole tokens. Tokenisation is hoisted out of
    the inner loop.
    """
    out = []
    for sentence in tqdm(texts):
        tokens = sentence.split()
        counts = Counter(tokens)          # exact whole-token frequencies
        total = len(tokens)
        vector = np.zeros(300)
        weight_sum = 0.0
        for word in tokens:
            if (word in glove_words) and (word in tfidf_words):
                tf_idf = dictionary[word] * (counts[word] / total)
                vector += model[word] * tf_idf
                weight_sum += tf_idf
        if weight_sum != 0:
            vector /= weight_sum
        out.append(vector)
    return out

train_tfidf_w2v_title = _tfidf_glove_title(X_train['project_title'])
print(len(train_tfidf_w2v_title))
print(len(train_tfidf_w2v_title[0]))

cv_tfidf_w2v_title = _tfidf_glove_title(X_cv['project_title'])
print(len(cv_tfidf_w2v_title))
print(len(cv_tfidf_w2v_title[0]))

test_tfidf_w2v_title = _tfidf_glove_title(X_test['project_title'])
print(len(test_tfidf_w2v_title))
print(len(test_tfidf_w2v_title[0]))

# Changing list to numpy arrays
train_tfidf_w2v_title = np.array(train_tfidf_w2v_title)
test_tfidf_w2v_title = np.array(test_tfidf_w2v_title)
cv_tfidf_w2v_title = np.array(cv_tfidf_w2v_title)
# TF-IDF vectorizer fitted ONLY on the TRAIN resource summaries; its idf table
# weights the Word2Vec averaging below.
tfidf_model_res_sum = TfidfVectorizer()
tfidf_model_res_sum.fit(X_train['project_resource_summary'].values)
# word -> idf value lookup
dictionary = dict(zip(tfidf_model_res_sum.get_feature_names(), list(tfidf_model_res_sum.idf_)))
tfidf_words = set(tfidf_model_res_sum.get_feature_names())
# ON PREPROCESSED RESOURCE SUMMARY (the original comment incorrectly said "TITLE")
# TFIDF-weighted Word2Vec, one 300-d vector per TRAIN resource summary.
train_tfidf_w2v_resum = []
for sentence in tqdm(X_train['project_resource_summary']):
    words = sentence.split()           # tokenize ONCE (original re-split per word)
    n_words = len(words)
    token_counts = Counter(words)
    vector = np.zeros(300)             # GloVe vectors are 300-dimensional
    tf_idf_weight = 0
    for word in words:
        if (word in glove_words) and (word in tfidf_words):
            vec = model[word]
            # BUG FIX: tf must count whole tokens; sentence.count(word) counted
            # substring occurrences (e.g. "art" inside "part").
            tf_idf = dictionary[word] * (token_counts[word] / n_words)
            vector += vec * tf_idf
            tf_idf_weight += tf_idf
    if tf_idf_weight != 0:
        vector /= tf_idf_weight
    train_tfidf_w2v_resum.append(vector)
print(len(train_tfidf_w2v_resum))
print(len(train_tfidf_w2v_resum[0]))
# TFIDF-weighted Word2Vec for the CV and TEST resource summaries (same scheme
# as the train split: idf * token-frequency weighted average of GloVe vectors).
cv_tfidf_w2v_resum = []
for sentence in tqdm(X_cv['project_resource_summary']):
    words = sentence.split()           # tokenize ONCE (original re-split per word)
    n_words = len(words)
    token_counts = Counter(words)
    vector = np.zeros(300)             # GloVe vectors are 300-dimensional
    tf_idf_weight = 0
    for word in words:
        if (word in glove_words) and (word in tfidf_words):
            vec = model[word]
            # BUG FIX: token count, not sentence.count(word) substring count.
            tf_idf = dictionary[word] * (token_counts[word] / n_words)
            vector += vec * tf_idf
            tf_idf_weight += tf_idf
    if tf_idf_weight != 0:
        vector /= tf_idf_weight
    cv_tfidf_w2v_resum.append(vector)
print(len(cv_tfidf_w2v_resum))
print(len(cv_tfidf_w2v_resum[0]))
test_tfidf_w2v_resum = []
for sentence in tqdm(X_test['project_resource_summary']):
    words = sentence.split()
    n_words = len(words)
    token_counts = Counter(words)
    vector = np.zeros(300)
    tf_idf_weight = 0
    for word in words:
        if (word in glove_words) and (word in tfidf_words):
            vec = model[word]
            # BUG FIX: token count, not sentence.count(word) substring count.
            tf_idf = dictionary[word] * (token_counts[word] / n_words)
            vector += vec * tf_idf
            tf_idf_weight += tf_idf
    if tf_idf_weight != 0:
        vector /= tf_idf_weight
    test_tfidf_w2v_resum.append(vector)
print(len(test_tfidf_w2v_resum))
print(len(test_tfidf_w2v_resum[0]))
# Convert the per-sentence lists to dense numpy matrices.
train_tfidf_w2v_resum = np.array(train_tfidf_w2v_resum)
test_tfidf_w2v_resum = np.array(test_tfidf_w2v_resum)
cv_tfidf_w2v_resum = np.array(cv_tfidf_w2v_resum)
# One-hot encode school_state: the CountVectorizer is fitted on TRAIN only,
# then applied to all three splits.
state_vectorizer = CountVectorizer()
X_train_state_ohe = state_vectorizer.fit_transform(X_train['school_state'].values)
X_cv_state_ohe = state_vectorizer.transform(X_cv['school_state'].values)
X_test_state_ohe = state_vectorizer.transform(X_test['school_state'].values)
print("After vectorizations")
print(X_train_state_ohe.shape, y_train.shape)
print(X_cv_state_ohe.shape, y_cv.shape)
print(X_test_state_ohe.shape, y_test.shape)
print(state_vectorizer.get_feature_names())
print("="*100)
# Same treatment for teacher_prefix.
prefix_vectorizer = CountVectorizer()
X_train_teacher_ohe = prefix_vectorizer.fit_transform(X_train['teacher_prefix'].values)
X_cv_teacher_ohe = prefix_vectorizer.transform(X_cv['teacher_prefix'].values)
X_test_teacher_ohe = prefix_vectorizer.transform(X_test['teacher_prefix'].values)
print("After vectorizations")
print(X_train_teacher_ohe.shape, y_train.shape)
print(X_cv_teacher_ohe.shape, y_cv.shape)
print(X_test_teacher_ohe.shape, y_test.shape)
print(prefix_vectorizer.get_feature_names())
print("="*100)
from collections import Counter
# BUG FIX: the original did `Counter = Counter()`, shadowing the Counter CLASS
# with an instance — any later `Counter(...)` call would raise TypeError.
grade_counter = Counter()
for word in X_train['project_grade_category'].values:
    grade_counter.update(word.split())
# dict sort by value python: https://stackoverflow.com/a/613218/4084039
project_grade_category_dict = dict(grade_counter)
sorted_project_grade_category_dict = dict(sorted(project_grade_category_dict.items(), key=lambda kv: kv[1]))
# Fixed vocabulary (sorted by frequency) keeps the column order deterministic.
vectorizer = CountVectorizer(vocabulary=list(sorted_project_grade_category_dict.keys()), lowercase=False, binary=True)
vectorizer.fit(X_train['project_grade_category'].values)  # fit has to happen only on train data
X_train_grade_ohe = vectorizer.transform(X_train['project_grade_category'].values)
X_cv_grade_ohe = vectorizer.transform(X_cv['project_grade_category'].values)
X_test_grade_ohe = vectorizer.transform(X_test['project_grade_category'].values)
print("After vectorizations")
print(X_train_grade_ohe.shape, y_train.shape)
print(X_cv_grade_ohe.shape, y_cv.shape)
print(X_test_grade_ohe.shape, y_test.shape)
print(vectorizer.get_feature_names())
print("="*100)
# One-hot encode clean_categories (fit on TRAIN only).
cat_vectorizer = CountVectorizer()
X_train_clean_cat_ohe = cat_vectorizer.fit_transform(X_train['clean_categories'].values)
X_cv_clean_cat_ohe = cat_vectorizer.transform(X_cv['clean_categories'].values)
X_test_clean_cat_ohe = cat_vectorizer.transform(X_test['clean_categories'].values)
print("After vectorizations")
print(X_train_clean_cat_ohe.shape, y_train.shape)
print(X_cv_clean_cat_ohe.shape, y_cv.shape)
print(X_test_clean_cat_ohe.shape, y_test.shape)
print(cat_vectorizer.get_feature_names())
print("="*100)
# One-hot encode clean_subcategories (fit on TRAIN only).
subcat_vectorizer = CountVectorizer()
X_train_clean_subcat_ohe = subcat_vectorizer.fit_transform(X_train['clean_subcategories'].values)
X_cv_clean_subcat_ohe = subcat_vectorizer.transform(X_cv['clean_subcategories'].values)
X_test_clean_subcat_ohe = subcat_vectorizer.transform(X_test['clean_subcategories'].values)
print("After vectorizations")
print(X_train_clean_subcat_ohe.shape, y_train.shape)
print(X_cv_clean_subcat_ohe.shape, y_cv.shape)
print(X_test_clean_subcat_ohe.shape, y_test.shape)
print(subcat_vectorizer.get_feature_names())
print("="*100)
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler

def _standardize_splits(column):
    """Fit a StandardScaler on the TRAIN values of *column*, transform all splits."""
    scaler = StandardScaler()
    scaler.fit(X_train[column].values.reshape(-1, 1))
    return (scaler.transform(X_train[column].values.reshape(-1, 1)),
            scaler.transform(X_cv[column].values.reshape(-1, 1)),
            scaler.transform(X_test[column].values.reshape(-1, 1)))

def _print_shapes(tr, cv_m, te):
    """Shape report printed after each standardization (mirrors the OHE sections)."""
    print("After vectorizations")
    print(tr.shape, y_train.shape)
    print(cv_m.shape, y_cv.shape)
    print(te.shape, y_test.shape)
    print("="*100)

# NOTE: the train name uses TNPPP while cv/test use TNPP — the inconsistent
# names are kept as-is because the concatenation code downstream references
# exactly these names.
X_train_TNPPP_std, X_cv_TNPP_std, X_test_TNPP_std = _standardize_splits('teacher_number_of_previously_posted_projects')
_print_shapes(X_train_TNPPP_std, X_cv_TNPP_std, X_test_TNPP_std)
X_train_price_std, X_cv_price_std, X_test_price_std = _standardize_splits('price')
_print_shapes(X_train_price_std, X_cv_price_std, X_test_price_std)
X_train_quantity_std, X_cv_quantity_std, X_test_quantity_std = _standardize_splits('quantity')
_print_shapes(X_train_quantity_std, X_cv_quantity_std, X_test_quantity_std)
# Standardizing the word-count features as they are numerical.
X_train_titlecount_std, X_cv_titlecount_std, X_test_titlecount_std = _standardize_splits('totalwords_title')
_print_shapes(X_train_titlecount_std, X_cv_titlecount_std, X_test_titlecount_std)
X_train_essaycount_std, X_cv_essaycount_std, X_test_essaycount_std = _standardize_splits('totalwords_essay')
_print_shapes(X_train_essaycount_std, X_cv_essaycount_std, X_test_essaycount_std)
# Sentiment-score columns reshaped to (n, 1) column vectors so they can be
# hstack-ed with the other feature matrices.
#Compound Score Array
X_train_compsc = X_train['Compound Score'].values.reshape(-1, 1)
X_cv_compsc = X_cv['Compound Score'].values.reshape(-1, 1)
X_test_compsc = X_test['Compound Score'].values.reshape(-1, 1)
#Neutral Score Array
X_train_neusc = X_train['Neutral Score'].values.reshape(-1, 1)
X_cv_neusc = X_cv['Neutral Score'].values.reshape(-1, 1)
X_test_neusc = X_test['Neutral Score'].values.reshape(-1, 1)
#Negative Score Array
X_train_negsc = X_train['Negative Score'].values.reshape(-1, 1)
X_cv_negsc = X_cv['Negative Score'].values.reshape(-1, 1)
X_test_negsc = X_test['Negative Score'].values.reshape(-1, 1)
#Positive Score Array
X_train_possc = X_train['Positive Score'].values.reshape(-1, 1)
X_cv_possc = X_cv['Positive Score'].values.reshape(-1, 1)
X_test_possc = X_test['Positive Score'].values.reshape(-1, 1)
# merge two sparse matrices: https://stackoverflow.com/a/19710648/4084039
from scipy.sparse import hstack

# Non-text features shared by every feature set, in the original column order.
train_common = [X_train_state_ohe, X_train_teacher_ohe, X_train_grade_ohe,
                X_train_clean_cat_ohe, X_train_clean_subcat_ohe,
                X_train_TNPPP_std, X_train_price_std, X_train_quantity_std,
                X_train_titlecount_std, X_train_essaycount_std,
                X_train_compsc, X_train_neusc, X_train_negsc, X_train_possc]
cv_common = [X_cv_state_ohe, X_cv_teacher_ohe, X_cv_grade_ohe,
             X_cv_clean_cat_ohe, X_cv_clean_subcat_ohe,
             X_cv_TNPP_std, X_cv_price_std, X_cv_quantity_std,
             X_cv_titlecount_std, X_cv_essaycount_std,
             X_cv_compsc, X_cv_neusc, X_cv_negsc, X_cv_possc]
test_common = [X_test_state_ohe, X_test_teacher_ohe, X_test_grade_ohe,
               X_test_clean_cat_ohe, X_test_clean_subcat_ohe,
               X_test_TNPP_std, X_test_price_std, X_test_quantity_std,
               X_test_titlecount_std, X_test_essaycount_std,
               X_test_compsc, X_test_neusc, X_test_negsc, X_test_possc]

def _final_matrix_report(tr, cv_m, te):
    """Shape report printed after each feature-set concatenation."""
    print("Final Data matrix")
    print(tr.shape, y_train.shape)
    print(cv_m.shape, y_cv.shape)
    print(te.shape, y_test.shape)
    print("="*100)

# Set 1: BoW text features + common features
X_tr_bow = hstack([X_train_essay_bow, X_train_title_bow, X_train_resum_bow] + train_common).tocsr()
X_cv_bow = hstack([X_cv_essay_bow, X_cv_title_bow, X_cv_resum_bow] + cv_common).tocsr()
X_te_bow = hstack([X_test_essay_bow, X_test_title_bow, X_test_resum_bow] + test_common).tocsr()
_final_matrix_report(X_tr_bow, X_cv_bow, X_te_bow)
# Set 2: TFIDF text features + common features
X_tr_tfidf = hstack([X_train_essay_tfidf, X_train_title_tfidf, X_train_resum_tfidf] + train_common).tocsr()
X_cv_tfidf = hstack([X_cv_essay_tfidf, X_cv_title_tfidf, X_cv_resum_tfidf] + cv_common).tocsr()
X_te_tfidf = hstack([X_test_essay_tfidf, X_test_title_tfidf, X_test_resum_tfidf] + test_common).tocsr()
_final_matrix_report(X_tr_tfidf, X_cv_tfidf, X_te_tfidf)
# Set 3: average-Word2Vec text features + common features
X_tr_avgw2v = hstack([train_w2v_vectors_essays, w2v_train_vectors_titles, w2v_train_vectors_resum] + train_common).tocsr()
X_cv_avgw2v = hstack([cv_w2v_vectors_essays, w2v_cv_vectors_titles, w2v_cv_vectors_resum] + cv_common).tocsr()
X_te_avgw2v = hstack([test_w2v_vectors_essays, w2v_test_vectors_titles, w2v_test_vectors_resum] + test_common).tocsr()
_final_matrix_report(X_tr_avgw2v, X_cv_avgw2v, X_te_avgw2v)
# Set 4: TFIDF-weighted-Word2Vec text features + common features
X_tr_tfidfw2v = hstack([train_tfidf_w2v_essay, train_tfidf_w2v_title, train_tfidf_w2v_resum] + train_common).tocsr()
X_cv_tfidfw2v = hstack([cv_tfidf_w2v_essay, cv_tfidf_w2v_title, cv_tfidf_w2v_resum] + cv_common).tocsr()
X_te_tfidfw2v = hstack([test_tfidf_w2v_essay, test_tfidf_w2v_title, test_tfidf_w2v_resum] + test_common).tocsr()
_final_matrix_report(X_tr_tfidfw2v, X_cv_tfidfw2v, X_te_tfidfw2v)
# Set 5: as mentioned in the assignment, project_title and project_resource_summary
# are excluded; only the TruncatedSVD-reduced TFIDF of the essay text is used.
# Converting d to d' using TruncatedSVD has been done in section 5.1.
X_tr_set5 = hstack([X_train_trunsvd_tfidf] + train_common).tocsr()
X_cv_set5 = hstack([X_cv_trunsvd_tfidf] + cv_common).tocsr()
X_te_set5 = hstack([X_test_trunsvd_tfidf] + test_common).tocsr()
_final_matrix_report(X_tr_set5, X_cv_set5, X_te_set5)
#Citation:
#This code is copied from here: https://stackoverflow.com/a/48803361/4084039
#With L1 Regularization!!
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
#when loss='hinge' we're performing Linear SVM. (No kernel is used)
model_bow = SGDClassifier(loss='hinge', penalty='l1', n_jobs=-1, class_weight='balanced')
param_grid = {
    'alpha': np.logspace(-4, 4, 9)
}
# return_train_score=True: newer scikit-learn defaults this to False, which
# would make cv_results_["mean_train_score"] below raise a KeyError.
grid = GridSearchCV(model_bow, param_grid, cv=3, scoring='roc_auc', return_train_score=True)
grid.fit(X_tr_bow, y_train)
alpha = np.logspace(-4, 4, 9)
train_auc = grid.cv_results_["mean_train_score"]
train_scores_std = grid.cv_results_["std_train_score"]
cv_auc = grid.cv_results_["mean_test_score"]
cv_scores_std = grid.cv_results_["std_test_score"]
plt.figure()
plt.title('Model')
plt.xlabel('Hyperparameter: Alpha')
plt.ylabel('AUC')
# plot train scores
plt.semilogx(alpha, train_auc, label='Train AUC', color='darkblue')
# shaded area between [mean - std, mean + std]
plt.gca().fill_between(alpha,
                       train_auc - train_scores_std,
                       train_auc + train_scores_std,
                       alpha=0.2,
                       color='darkblue')
plt.semilogx(alpha, cv_auc, label='CV AUC', color='darkorange')
plt.gca().fill_between(alpha,
                       cv_auc - cv_scores_std,
                       cv_auc + cv_scores_std,
                       alpha=0.2,
                       color='darkorange')
plt.scatter(alpha, train_auc, label='Train AUC points', color='darkblue')
plt.scatter(alpha, cv_auc, label='CV AUC points', color='darkorange')
# legend outside the axes: https://matplotlib.org/users/legend_guide.html
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()
print("Best parameters with L1 Regularization: ", grid.best_params_)
print('AUC with the best parameters: ', grid.best_score_)
#Citation:
#This code is copied from here: https://stackoverflow.com/a/48803361/4084039
#With L2 Regularization!!  (the original comment incorrectly said L1)
model_bow_l2 = SGDClassifier(loss='hinge', penalty='l2', n_jobs=-1, class_weight='balanced')
param_grid = {
    'alpha': np.logspace(-4, 4, 9)
}
grid_bow_l2 = GridSearchCV(model_bow_l2, param_grid, cv=3, scoring='roc_auc', return_train_score=True)
grid_bow_l2.fit(X_tr_bow, y_train)
alpha = np.logspace(-4, 4, 9)
train_auc = grid_bow_l2.cv_results_["mean_train_score"]
train_scores_std = grid_bow_l2.cv_results_["std_train_score"]
cv_auc = grid_bow_l2.cv_results_["mean_test_score"]
cv_scores_std = grid_bow_l2.cv_results_["std_test_score"]
plt.figure()
plt.title('Model')
plt.xlabel('Hyperparameter: Alpha')
plt.ylabel('AUC')
plt.semilogx(alpha, train_auc, label='Train AUC', color='darkblue')
plt.gca().fill_between(alpha,
                       train_auc - train_scores_std,
                       train_auc + train_scores_std,
                       alpha=0.2,
                       color='darkblue')
plt.semilogx(alpha, cv_auc, label='CV AUC', color='darkorange')
plt.gca().fill_between(alpha,
                       cv_auc - cv_scores_std,
                       cv_auc + cv_scores_std,
                       alpha=0.2,
                       color='darkorange')
plt.scatter(alpha, train_auc, label='Train AUC points', color='darkblue')
plt.scatter(alpha, cv_auc, label='CV AUC points', color='darkorange')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()
# BUG FIX: this section is L2 — the original print said "L1 Regularization".
print("Best parameters with L2 Regularization: ", grid_bow_l2.best_params_)
print('AUC with the best parameters: ', grid_bow_l2.best_score_)
# Analysis:
%%time
#Citation: plot roc auc curve
#url: https://stackabuse.com/understanding-roc-curves-with-python/
best_alpha = 0.1
best_penalty = 'l2'
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot
final_model_bow = SGDClassifier(loss='hinge', alpha=best_alpha, penalty=best_penalty, class_weight='balanced', n_jobs = -1)
#Using CalibratedClassifierCV as SVM doesnt natively support probabilities
calibrated = CalibratedClassifierCV(final_model_bow, method='sigmoid', cv=5)
calibrated.fit(X_tr_bow, y_train)
def plot_roc_curve(test_fpr, test_tpr, train_fpr, train_tpr):
    """Overlay the train and test ROC curves with the chance diagonal."""
    plt.plot(train_fpr, train_tpr, color='red', label='ROC for train')
    plt.plot(test_fpr, test_tpr, color='orange', label='ROC for test')
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    # BUG FIX: a ROC curve plots FPR vs TPR — the original labelled the axes
    # 'C: Hyperparameter' / 'AUC'.
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend()
    plt.show()
# Positive-class probabilities from the calibrated BoW model.
y_train_pred = calibrated.predict_proba(X_tr_bow)[:, 1]
y_test_pred = calibrated.predict_proba(X_te_bow)[:, 1]
auc_train = roc_auc_score(y_train, y_train_pred)
print('AUC of Train Data: %.2f' % auc_train)
auc_test = roc_auc_score(y_test, y_test_pred)
print('AUC of Test Data: %.2f' % auc_test)
# ROC points (and the thresholds they correspond to) for both splits.
train_fpr, train_tpr, train_thresholds = roc_curve(y_train, y_train_pred)
test_fpr, test_tpr, test_thresholds = roc_curve(y_test, y_test_pred)
plot_roc_curve(test_fpr, test_tpr, train_fpr, train_tpr)
# Custom predict: binarize probabilities at the ROC threshold that maximises
# tpr*(1-fpr), i.e. high tpr while fpr stays low.
def predict(proba, threshould, fpr, tpr):
    """Return 0/1 predictions for *proba* using the best ROC threshold."""
    score = tpr * (1 - fpr)
    t = threshould[np.argmax(score)]
    print("the maximum value of tpr*(1-fpr)", max(tpr*(1-fpr)), "for threshold", np.round(t,3))
    return [1 if p >= t else 0 for p in proba]
from sklearn.metrics import confusion_matrix
# BUG FIX: the original called predict(..., train_fpr, train_fpr), passing fpr
# twice, so the threshold search maximised fpr*(1-fpr) instead of tpr*(1-fpr).
print("Train confusion matrix")
cm_train = confusion_matrix(y_train, predict(y_train_pred, train_thresholds, train_fpr, train_tpr))
class_names = ['negative','positive']
sns.heatmap(cm_train, annot=True, fmt='d',cmap='viridis')
# BUG FIX: confusion_matrix rows (heatmap y-axis) are TRUE labels and columns
# are PREDICTED labels — the original had the axis labels swapped.
plt.ylabel('True label',size=18)
plt.xlabel('Predicted label',size=18)
plt.title("Train Confusion Matrix\n",size=24)
plt.show()
print("Test confusion matrix")
cm_test = confusion_matrix(y_test, predict(y_test_pred, test_thresholds, test_fpr, test_tpr))
class_names = ['negative','positive']
sns.heatmap(cm_test, annot=True, fmt='d',cmap='viridis')
plt.ylabel('True label',size=18)
plt.xlabel('Predicted label',size=18)
plt.title("Test Confusion Matrix\n",size=24)
plt.show()
#Citation:
#This code is copied from here: https://stackoverflow.com/a/48803361/4084039
#With L1 Regularization!
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
#when loss='hinge' we're performing Linear SVM. (No kernel is used)
model_tfidf = SGDClassifier(loss='hinge', penalty='l1', n_jobs=-1, class_weight='balanced')
param_grid = {
    'alpha': np.logspace(-4, 4, 9)
}
# return_train_score=True: newer scikit-learn defaults this to False, which
# would make cv_results_["mean_train_score"] below raise a KeyError.
grid_tfidf_l1 = GridSearchCV(model_tfidf, param_grid, cv=3, scoring='roc_auc', return_train_score=True)
grid_tfidf_l1.fit(X_tr_tfidf, y_train)
alpha = np.logspace(-4, 4, 9)
train_auc = grid_tfidf_l1.cv_results_["mean_train_score"]
train_scores_std = grid_tfidf_l1.cv_results_["std_train_score"]
cv_auc = grid_tfidf_l1.cv_results_["mean_test_score"]
cv_scores_std = grid_tfidf_l1.cv_results_["std_test_score"]
plt.figure()
plt.title('Model')
plt.xlabel('Hyperparameter: Alpha')
plt.ylabel('AUC')
# plot train scores with a [mean - std, mean + std] band
plt.semilogx(alpha, train_auc, label='Train AUC', color='darkblue')
plt.gca().fill_between(alpha,
                       train_auc - train_scores_std,
                       train_auc + train_scores_std,
                       alpha=0.2,
                       color='darkblue')
plt.semilogx(alpha, cv_auc, label='CV AUC', color='darkorange')
plt.gca().fill_between(alpha,
                       cv_auc - cv_scores_std,
                       cv_auc + cv_scores_std,
                       alpha=0.2,
                       color='darkorange')
plt.scatter(alpha, train_auc, label='Train AUC points', color='darkblue')
plt.scatter(alpha, cv_auc, label='CV AUC points', color='darkorange')
# legend outside the axes: https://matplotlib.org/users/legend_guide.html
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()
print("Best parameters with L1 Regularization: ", grid_tfidf_l1.best_params_)
print('AUC with the best parameters: ', grid_tfidf_l1.best_score_)
#Citation:
#This code is copied from here: https://stackoverflow.com/a/48803361/4084039
#With L2 Regularization!
model_tfidf_l2 = SGDClassifier(loss='hinge', penalty='l2', n_jobs=-1, class_weight='balanced')
param_grid = {
    'alpha': np.logspace(-4, 4, 9)
}
grid_tfidf_l2 = GridSearchCV(model_tfidf_l2, param_grid, cv=3, scoring='roc_auc', return_train_score=True)
grid_tfidf_l2.fit(X_tr_tfidf, y_train)
alpha = np.logspace(-4, 4, 9)
train_auc = grid_tfidf_l2.cv_results_["mean_train_score"]
train_scores_std = grid_tfidf_l2.cv_results_["std_train_score"]
cv_auc = grid_tfidf_l2.cv_results_["mean_test_score"]
cv_scores_std = grid_tfidf_l2.cv_results_["std_test_score"]
plt.figure()
plt.title('Model')
plt.xlabel('Hyperparameter: Alpha')
plt.ylabel('AUC')
plt.semilogx(alpha, train_auc, label='Train AUC', color='darkblue')
plt.gca().fill_between(alpha,
                       train_auc - train_scores_std,
                       train_auc + train_scores_std,
                       alpha=0.2,
                       color='darkblue')
plt.semilogx(alpha, cv_auc, label='CV AUC', color='darkorange')
plt.gca().fill_between(alpha,
                       cv_auc - cv_scores_std,
                       cv_auc + cv_scores_std,
                       alpha=0.2,
                       color='darkorange')
plt.scatter(alpha, train_auc, label='Train AUC points', color='darkblue')
plt.scatter(alpha, cv_auc, label='CV AUC points', color='darkorange')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()
print("Best parameters with L2 Regularization: ", grid_tfidf_l2.best_params_)
print('AUC with the best parameters: ', grid_tfidf_l2.best_score_)
# Analysis for TFIDF:
%%time
#Citation: plot roc auc curve
#url: https://stackabuse.com/understanding-roc-curves-with-python/
best_alpha = 0.01
best_penalty = 'l2'
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot
model_tfidf_final = SGDClassifier(loss='hinge', alpha=best_alpha, penalty=best_penalty, class_weight='balanced', n_jobs = -1)
#Using CalibratedClassifierCV as SVM doesnt natively support probabilities
calibrated_tfidf = CalibratedClassifierCV(model_tfidf_final, method='sigmoid', cv=5)
calibrated_tfidf.fit(X_tr_tfidf, y_train)
def plot_roc_curve(test_fpr, test_tpr, train_fpr, train_tpr):
    """Overlay the train and test ROC curves with the chance diagonal."""
    plt.plot(train_fpr, train_tpr, color='red', label='ROC for train')
    plt.plot(test_fpr, test_tpr, color='orange', label='ROC for test')
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    # BUG FIX: a ROC curve plots FPR vs TPR — the original labelled the axes
    # 'C: Hyperparameter' / 'AUC'.
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend()
    plt.show()
# Positive-class probabilities from the calibrated TFIDF model.
y_train_pred = calibrated_tfidf.predict_proba(X_tr_tfidf)[:, 1]
y_test_pred = calibrated_tfidf.predict_proba(X_te_tfidf)[:, 1]
auc_train = roc_auc_score(y_train, y_train_pred)
print('AUC of Train Data: %.2f' % auc_train)
auc_test = roc_auc_score(y_test, y_test_pred)
print('AUC of Test Data: %.2f' % auc_test)
# ROC points (and the thresholds they correspond to) for both splits.
train_fpr, train_tpr, train_thresholds = roc_curve(y_train, y_train_pred)
test_fpr, test_tpr, test_thresholds = roc_curve(y_test, y_test_pred)
plot_roc_curve(test_fpr, test_tpr, train_fpr, train_tpr)
# Custom predict: binarize probabilities at the ROC threshold that maximises
# tpr*(1-fpr), i.e. high tpr while fpr stays low.
def predict(proba, threshould, fpr, tpr):
    """Return 0/1 predictions for *proba* using the best ROC threshold."""
    score = tpr * (1 - fpr)
    t = threshould[np.argmax(score)]
    print("the maximum value of tpr*(1-fpr)", max(tpr*(1-fpr)), "for threshold", np.round(t,3))
    return [1 if p >= t else 0 for p in proba]
from sklearn.metrics import confusion_matrix
# BUG FIX: the original called predict(..., train_fpr, train_fpr), passing fpr
# twice, so the threshold search maximised fpr*(1-fpr) instead of tpr*(1-fpr).
print("Train confusion matrix")
cm_train = confusion_matrix(y_train, predict(y_train_pred, train_thresholds, train_fpr, train_tpr))
class_names = ['negative','positive']
sns.heatmap(cm_train, annot=True, fmt='d',cmap='viridis')
# BUG FIX: confusion_matrix rows (heatmap y-axis) are TRUE labels and columns
# are PREDICTED labels — the original had the axis labels swapped.
plt.ylabel('True label',size=18)
plt.xlabel('Predicted label',size=18)
plt.title("Train Confusion Matrix\n",size=24)
plt.show()
print("Test confusion matrix")
cm_test = confusion_matrix(y_test, predict(y_test_pred, test_thresholds, test_fpr, test_tpr))
class_names = ['negative','positive']
sns.heatmap(cm_test, annot=True, fmt='d',cmap='viridis')
plt.ylabel('True label',size=18)
plt.xlabel('Predicted label',size=18)
plt.title("Test Confusion Matrix\n",size=24)
plt.show()
#Citation:
#This code is copied from here: https://stackoverflow.com/a/48803361/4084039
# Tune alpha for a hinge-loss SGD (linear SVM) on AVG W2V features with L1 regularization.
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
#when loss='hinge' we're performing Linear SVM. (No kernel is used)
model_avgw2v = SGDClassifier(loss='hinge', penalty='l1', n_jobs = -1, class_weight='balanced')
param_grid = {
'alpha': np.logspace(-4, 4, 9)
}
grid_avgw2v_l1 = GridSearchCV(model_avgw2v, param_grid, cv=3, scoring='roc_auc')
grid_avgw2v_l1.fit(X_tr_avgw2v, y_train)
# x-axis for the plot: the same alpha grid searched above
alpha = np.logspace(-4, 4, 9)
# mean/std train and CV AUC per alpha, straight from cv_results_
train_auc = grid_avgw2v_l1.cv_results_["mean_train_score"]
train_scores_std = grid_avgw2v_l1.cv_results_["std_train_score"]
cv_auc = grid_avgw2v_l1.cv_results_["mean_test_score"]
cv_scores_std = grid_avgw2v_l1.cv_results_["std_test_score"]
plt.figure()
plt.title('Model')
plt.xlabel('Hyperparameter: Alpha')
plt.ylabel('AUC')
# plot train scores
plt.semilogx(alpha, train_auc, label='Train AUC', color='darkblue')
# create a shaded area between [mean - std, mean + std]
# (the fill_between keyword alpha=0.2 is plot transparency, not the hyperparameter)
plt.gca().fill_between(alpha,
train_auc - train_scores_std,
train_auc + train_scores_std,
alpha=0.2,
color='darkblue')
plt.semilogx(alpha, cv_auc, label='CV AUC', color='darkorange')
# create a shaded area between [mean - std, mean + std]
plt.gca().fill_between(alpha,
cv_auc - cv_scores_std,
cv_auc + cv_scores_std,
alpha=0.2,
color='darkorange')
plt.scatter(alpha, train_auc, label='Train AUC points', color='darkblue')
plt.scatter(alpha, cv_auc, label='CV AUC points', color='darkorange')
#Citation for plotting the legend outside the plot
#url: https://matplotlib.org/users/legend_guide.html
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()
# NOTE(fix): this grid search uses penalty='l1'; report it as L1 (the original printed "L2").
print("Best parameters with L1 Regularization: ", grid_avgw2v_l1.best_params_)
print('AUC with the best parameters: ', grid_avgw2v_l1.best_score_)
#Citation:
#This code is copied from here: https://stackoverflow.com/a/48803361/4084039
# Same grid search as the cell above, but with L2 regularization on AVG W2V features.
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
#when loss='hinge' we're performing Linear SVM. (No kernel is used)
model_avgw2v_l2 = SGDClassifier(loss='hinge', penalty='l2', n_jobs = -1, class_weight='balanced')
param_grid = {
'alpha': np.logspace(-4, 4, 9)
}
grid_avgw2v_l2 = GridSearchCV(model_avgw2v_l2, param_grid, cv=3, scoring='roc_auc')
grid_avgw2v_l2.fit(X_tr_avgw2v, y_train)
# x-axis values: the same alpha grid used in param_grid above
alpha = np.logspace(-4, 4, 9)
# mean/std train and CV AUC per alpha from cv_results_
train_auc = grid_avgw2v_l2.cv_results_["mean_train_score"]
train_scores_std = grid_avgw2v_l2.cv_results_["std_train_score"]
cv_auc = grid_avgw2v_l2.cv_results_["mean_test_score"]
cv_scores_std = grid_avgw2v_l2.cv_results_["std_test_score"]
plt.figure()
plt.title('Model')
plt.xlabel('Hyperparameter: Alpha')
plt.ylabel('AUC')
# plot train scores
plt.semilogx(alpha, train_auc, label='Train AUC', color='darkblue')
# create a shaded area between [mean - std, mean + std]
# (the fill_between keyword alpha=0.2 is plot transparency, not the hyperparameter)
plt.gca().fill_between(alpha,
train_auc - train_scores_std,
train_auc + train_scores_std,
alpha=0.2,
color='darkblue')
plt.semilogx(alpha, cv_auc, label='CV AUC', color='darkorange')
# create a shaded area between [mean - std, mean + std]
plt.gca().fill_between(alpha,
cv_auc - cv_scores_std,
cv_auc + cv_scores_std,
alpha=0.2,
color='darkorange')
plt.scatter(alpha, train_auc, label='Train AUC points', color='darkblue')
plt.scatter(alpha, cv_auc, label='CV AUC points', color='darkorange')
#Citation for plotting the legend outside the plot
#url: https://matplotlib.org/users/legend_guide.html
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()
print("Best parameters with L2 Regularization: ", grid_avgw2v_l2.best_params_)
print('AUC with the best parameters: ', grid_avgw2v_l2.best_score_)
Analysis for AVG W2V:
%%time
#Citation: plot roc auc curve
#url: https://stackabuse.com/understanding-roc-curves-with-python/
# Retrain the AVG W2V linear SVM with the best hyperparameters found above (L2, alpha=0.01).
best_alpha = 0.01
best_penalty = 'l2'
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot
model_avgw2v_final = SGDClassifier(loss='hinge', alpha=best_alpha, penalty=best_penalty, class_weight='balanced', n_jobs = -1)
#Using CalibratedClassifierCV as SVM doesnt natively support probabilities
# NOTE(review): CalibratedClassifierCV is not imported in this cell — presumably
# imported earlier in the notebook (from sklearn.calibration); verify.
calibrated_avgw2v = CalibratedClassifierCV(model_avgw2v_final, method='sigmoid', cv=5)
calibrated_avgw2v.fit(X_tr_avgw2v, y_train)
def plot_roc_curve(test_fpr, test_tpr, train_fpr, train_tpr):
    """Overlay the train and test ROC curves with the y=x chance diagonal."""
    plt.plot(train_fpr, train_tpr, color='red', label='ROC for train')
    plt.plot(test_fpr, test_tpr, color='orange', label='ROC for test')
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    # NOTE(fix): a ROC curve plots FPR vs TPR; the original axis labels
    # ('C: Hyperparameter' / 'AUC') belonged to a hyperparameter plot.
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend()
    plt.show()
# Positive-class scores: column 1 of predict_proba is P(class == 1).
y_test_pred = calibrated_avgw2v.predict_proba(X_te_avgw2v)[:, 1]
y_train_pred = calibrated_avgw2v.predict_proba(X_tr_avgw2v)[:, 1]
auc_train = roc_auc_score(y_train, y_train_pred)
print('AUC of Train Data: %.2f' % auc_train)
auc_test = roc_auc_score(y_test, y_test_pred)
print('AUC of Test Data: %.2f' % auc_test)
# ROC points and thresholds; the thresholds are reused below by the custom predict().
train_fpr, train_tpr, train_thresholds = roc_curve(y_train, y_train_pred)
test_fpr, test_tpr, test_thresholds = roc_curve(y_test, y_test_pred)
plot_roc_curve(test_fpr, test_tpr, train_fpr, train_tpr)
# Custom predict: pick the ROC threshold that maximises tpr*(1-fpr) — a point
# with high TPR and low FPR — then binarise the probability scores at it.
def predict(proba, threshould, fpr, tpr):
    """Turn probability scores into 0/1 labels at the best ROC threshold."""
    objective = tpr * (1 - fpr)  # large only when tpr is high AND fpr is low
    t = threshould[np.argmax(objective)]
    print("the maximum value of tpr*(1-fpr)", max(tpr*(1-fpr)), "for threshold", np.round(t,3))
    # 1 when the score clears the threshold, else 0
    return [1 if score >= t else 0 for score in proba]
from sklearn.metrics import confusion_matrix
print("Train confusion matrix")
# NOTE(fix): predict() expects (proba, thresholds, fpr, tpr); the original
# passed train_fpr twice, so the threshold was picked from the wrong arrays.
cm_train = confusion_matrix(y_train, predict(y_train_pred, train_thresholds, train_fpr, train_tpr))
class_names = ['negative', 'positive']
sns.heatmap(cm_train, annot=True, fmt='d', cmap='viridis')
# confusion_matrix puts true labels on the rows -> y-axis is the TRUE label.
plt.ylabel('True label', size=18)
plt.xlabel('Predicted label', size=18)
plt.title("Train Confusion Matrix\n", size=24)
plt.show()
print("Test confusion matrix")
# NOTE(fix): same tpr-for-fpr swap corrected for the test split.
cm_test = confusion_matrix(y_test, predict(y_test_pred, test_thresholds, test_fpr, test_tpr))
class_names = ['negative', 'positive']
sns.heatmap(cm_test, annot=True, fmt='d', cmap='viridis')
plt.ylabel('True label', size=18)
plt.xlabel('Predicted label', size=18)
plt.title("Test Confusion Matrix\n", size=24)
plt.show()
#Citation:
#This code is copied from here: https://stackoverflow.com/a/48803361/4084039
# Tune alpha for a hinge-loss SGD (linear SVM) on TFIDF-weighted W2V features, L1 penalty.
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
#when loss='hinge' we're performing Linear SVM. (No kernel is used)
model_tfidfavgw2v = SGDClassifier(loss='hinge', penalty='l1', n_jobs = -1, class_weight='balanced')
param_grid = {
'alpha': np.logspace(-4, 4, 9)
}
grid_tfidfavgw2v_l1 = GridSearchCV(model_tfidfavgw2v, param_grid, cv=5, scoring='roc_auc')
grid_tfidfavgw2v_l1.fit(X_tr_tfidfw2v, y_train)
# x-axis values: the same alpha grid used in param_grid above
alpha = np.logspace(-4, 4, 9)
# mean/std train and CV AUC per alpha from cv_results_
train_auc = grid_tfidfavgw2v_l1.cv_results_["mean_train_score"]
train_scores_std = grid_tfidfavgw2v_l1.cv_results_["std_train_score"]
cv_auc = grid_tfidfavgw2v_l1.cv_results_["mean_test_score"]
cv_scores_std = grid_tfidfavgw2v_l1.cv_results_["std_test_score"]
plt.figure()
plt.title('Model')
plt.xlabel('Hyperparameter: Alpha')
plt.ylabel('AUC')
# plot train scores
plt.semilogx(alpha, train_auc, label='Train AUC', color='darkblue')
# create a shaded area between [mean - std, mean + std]
# (the fill_between keyword alpha=0.2 is plot transparency, not the hyperparameter)
plt.gca().fill_between(alpha,
train_auc - train_scores_std,
train_auc + train_scores_std,
alpha=0.2,
color='darkblue')
plt.semilogx(alpha, cv_auc, label='CV AUC', color='darkorange')
# create a shaded area between [mean - std, mean + std]
plt.gca().fill_between(alpha,
cv_auc - cv_scores_std,
cv_auc + cv_scores_std,
alpha=0.2,
color='darkorange')
plt.scatter(alpha, train_auc, label='Train AUC points', color='darkblue')
plt.scatter(alpha, cv_auc, label='CV AUC points', color='darkorange')
#Citation for plotting the legend outside the plot
#url: https://matplotlib.org/users/legend_guide.html
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()
print("Best parameters with L1 Regularization: ", grid_tfidfavgw2v_l1.best_params_)
print('AUC with the best parameters: ', grid_tfidfavgw2v_l1.best_score_)
#Citation:
#This code is copied from here: https://stackoverflow.com/a/48803361/4084039
# Same grid search as above on TFIDF-weighted W2V features, but with L2 penalty.
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
#when loss='hinge' we're performing Linear SVM. (No kernel is used)
model_tfidfavgw2v_l2 = SGDClassifier(loss='hinge', penalty='l2', n_jobs = -1, class_weight='balanced')
param_grid = {
'alpha': np.logspace(-4, 4, 9)
}
grid_tfidfavgw2v_l2 = GridSearchCV(model_tfidfavgw2v_l2, param_grid, cv=5, scoring='roc_auc')
grid_tfidfavgw2v_l2.fit(X_tr_tfidfw2v, y_train)
# x-axis values: the same alpha grid used in param_grid above
alpha = np.logspace(-4, 4, 9)
# mean/std train and CV AUC per alpha from cv_results_
train_auc = grid_tfidfavgw2v_l2.cv_results_["mean_train_score"]
train_scores_std = grid_tfidfavgw2v_l2.cv_results_["std_train_score"]
cv_auc = grid_tfidfavgw2v_l2.cv_results_["mean_test_score"]
cv_scores_std = grid_tfidfavgw2v_l2.cv_results_["std_test_score"]
plt.figure()
plt.title('Model')
plt.xlabel('Hyperparameter: Alpha')
plt.ylabel('AUC')
# plot train scores
plt.semilogx(alpha, train_auc, label='Train AUC', color='darkblue')
# create a shaded area between [mean - std, mean + std]
# (the fill_between keyword alpha=0.2 is plot transparency, not the hyperparameter)
plt.gca().fill_between(alpha,
train_auc - train_scores_std,
train_auc + train_scores_std,
alpha=0.2,
color='darkblue')
plt.semilogx(alpha, cv_auc, label='CV AUC', color='darkorange')
# create a shaded area between [mean - std, mean + std]
plt.gca().fill_between(alpha,
cv_auc - cv_scores_std,
cv_auc + cv_scores_std,
alpha=0.2,
color='darkorange')
plt.scatter(alpha, train_auc, label='Train AUC points', color='darkblue')
plt.scatter(alpha, cv_auc, label='CV AUC points', color='darkorange')
#Citation for plotting the legend outside the plot
#url: https://matplotlib.org/users/legend_guide.html
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()
print("Best parameters with L2 Regularization: ", grid_tfidfavgw2v_l2.best_params_)
print('AUC with the best parameters: ', grid_tfidfavgw2v_l2.best_score_)
Analysis for TFIDF W2V:
%%time
#Citation: plot roc auc curve
#url: https://stackabuse.com/understanding-roc-curves-with-python/
best_alpha = 0.01
best_penalty = 'l2'
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot
model_tfidfw2v_final = SGDClassifier(loss='hinge', alpha=best_alpha, penalty=best_penalty, class_weight='balanced', n_jobs = -1)
#Using CalibratedClassifierCV as SVM doesnt natively support probabilities
calibrated_tfidfavgw2v = CalibratedClassifierCV(model_avgw2v_final, method='sigmoid', cv=5)
calibrated_tfidfavgw2v.fit(X_tr_avgw2v, y_train)
def plot_roc_curve(test_fpr, test_tpr, train_fpr, train_tpr):
    """Overlay the train and test ROC curves with the y=x chance diagonal."""
    plt.plot(train_fpr, train_tpr, color='red', label='ROC for train')
    plt.plot(test_fpr, test_tpr, color='orange', label='ROC for test')
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    # NOTE(fix): ROC axes are FPR vs TPR, not 'C: Hyperparameter' / 'AUC'.
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend()
    plt.show()
# Positive-class scores: column 1 of predict_proba is P(class == 1).
y_test_pred = calibrated_tfidfavgw2v.predict_proba(X_te_tfidfw2v)[:, 1]
y_train_pred = calibrated_tfidfavgw2v.predict_proba(X_tr_tfidfw2v)[:, 1]
auc_train = roc_auc_score(y_train, y_train_pred)
print('AUC of Train Data: %.2f' % auc_train)
auc_test = roc_auc_score(y_test, y_test_pred)
print('AUC of Test Data: %.2f' % auc_test)
# ROC points and thresholds; the thresholds are reused below by the custom predict().
train_fpr, train_tpr, train_thresholds = roc_curve(y_train, y_train_pred)
test_fpr, test_tpr, test_thresholds = roc_curve(y_test, y_test_pred)
plot_roc_curve(test_fpr, test_tpr, train_fpr, train_tpr)
# Custom thresholded predict. The operating point is where tpr*(1-fpr) peaks,
# which favours high TPR together with low FPR.
def predict(proba, threshould, fpr, tpr):
    """Binarise probability scores at the ROC threshold maximising tpr*(1-fpr)."""
    quality = tpr * (1 - fpr)
    best_idx = np.argmax(quality)
    t = threshould[best_idx]
    print("the maximum value of tpr*(1-fpr)", max(tpr*(1-fpr)), "for threshold", np.round(t,3))
    # label 1 iff the score reaches the chosen threshold
    return [int(score >= t) for score in proba]
from sklearn.metrics import confusion_matrix
print("Train confusion matrix")
# NOTE(fix): predict() expects (proba, thresholds, fpr, tpr); the original
# passed train_fpr twice, so the threshold was picked from the wrong arrays.
cm_train = confusion_matrix(y_train, predict(y_train_pred, train_thresholds, train_fpr, train_tpr))
class_names = ['negative', 'positive']
sns.heatmap(cm_train, annot=True, fmt='d', cmap='viridis')
# confusion_matrix puts true labels on the rows -> y-axis is the TRUE label.
plt.ylabel('True label', size=18)
plt.xlabel('Predicted label', size=18)
plt.title("Train Confusion Matrix\n", size=24)
plt.show()
print("Test confusion matrix")
# NOTE(fix): same tpr-for-fpr swap corrected for the test split.
cm_test = confusion_matrix(y_test, predict(y_test_pred, test_thresholds, test_fpr, test_tpr))
class_names = ['negative', 'positive']
sns.heatmap(cm_test, annot=True, fmt='d', cmap='viridis')
plt.ylabel('True label', size=18)
plt.xlabel('Predicted label', size=18)
plt.title("Test Confusion Matrix\n", size=24)
plt.show()
Considering The Following Features:
#importing TruncatedSVD
from sklearn import decomposition
# Default instance; n_components is set explicitly before each fit below.
TruncSVD = decomposition.TruncatedSVD()
print('Training Matrix Shape:', X_train_essay_tfidf.shape)
print('CV Matrix Shape:', X_cv_essay_tfidf.shape)
print('Test Matrix Shape:', X_test_essay_tfidf.shape)
#Code for dimensionality reduction on training matrix.
#these are principle components and not features
TruncSVD.n_components = 1000
TSVD_train_data = TruncSVD.fit_transform(X_train_essay_tfidf)
#explained_variance gives us the var (lambda_i values) which we divide with summation of those lambda_i values.
percentage_var_explained_train = TruncSVD.explained_variance_ / np.sum(TruncSVD.explained_variance_)
#cumsum keeps adding the lambda_i values in the numerator / with summation of all lambda_i's
cum_var_explained = np.cumsum(percentage_var_explained_train)
# Plot the TruncatedSVD spectrum
plt.figure(1, figsize=(6, 4))
plt.clf()
plt.plot(cum_var_explained, linewidth=2)
plt.axis('tight')
plt.grid()
plt.xlabel('n_components')
plt.ylabel('Cumulative_explained_variance')
plt.show()
#If we take 800-dimensions, a little over 90% of variance is explained for training.
#Code for dimensionality reduction on CV matrix.
#these are principle components and not features
# NOTE(review): this refits the SVD on the CV split — fine for eyeballing the
# variance spectrum, but the actual features (below) must come from the
# train-fitted SVD only; confirm that is the intent here.
TruncSVD.n_components = 1000
TSVD_cv_data = TruncSVD.fit_transform(X_cv_essay_tfidf)
#explained_variance gives us the var (lambda_i values) which we divide with summation of those lambda_i values.
percentage_var_explained_cv = TruncSVD.explained_variance_ / np.sum(TruncSVD.explained_variance_)
#cumsum keeps adding the lambda_i values in the numerator / with summation of all lambda_i's
cum_var_explained_cv = np.cumsum(percentage_var_explained_cv)
# Plot the TruncatedSVD spectrum
plt.figure(1, figsize=(6, 4))
plt.clf()
plt.plot(cum_var_explained_cv, linewidth=2)
plt.axis('tight')
plt.grid()
plt.xlabel('n_components')
plt.ylabel('Cumulative_explained_variance')
plt.show()
#If we take 800-dimensions, a little over 90% of variance is explained for the CV split
#Code for dimensionality reduction on test matrix.
#these are principle components and not features
# NOTE(review): as with the CV cell, this refits the SVD on the test split for
# spectrum inspection only; the modelling features below use the train-fitted SVD.
TruncSVD.n_components = 1000
TSVD_test_data = TruncSVD.fit_transform(X_test_essay_tfidf)
#explained_variance gives us the var (lambda_i values) which we divide with summation of those lambda_i values.
percentage_var_explained_test = TruncSVD.explained_variance_ / np.sum(TruncSVD.explained_variance_)
#cumsum keeps adding the lambda_i values in the numerator / with summation of all lambda_i's
cum_var_explained_test = np.cumsum(percentage_var_explained_test)
# Plot the TruncatedSVD spectrum
plt.figure(1, figsize=(6, 4))
plt.clf()
plt.plot(cum_var_explained_test, linewidth=2)
plt.axis('tight')
plt.grid()
plt.xlabel('n_components')
plt.ylabel('Cumulative_explained_variance')
plt.show()
#If we take 800-dimensions, a little over 90% of variance is explained for the test split
Analysis:
from sklearn.decomposition import TruncatedSVD
#n_components = 800 as shown in the plots above
Truncated_SVD_tfidf = TruncatedSVD(n_components=800, random_state=0)
Truncated_SVD_tfidf.fit(X_train_essay_tfidf) #fit only on train data
# Project all three splits onto the components learned from train only,
# so no CV/test information leaks into the decomposition.
X_train_trunsvd_tfidf = Truncated_SVD_tfidf.transform(X_train_essay_tfidf)
X_cv_trunsvd_tfidf = Truncated_SVD_tfidf.transform(X_cv_essay_tfidf)
X_test_trunsvd_tfidf = Truncated_SVD_tfidf.transform(X_test_essay_tfidf)
# Sanity check: same row counts, feature dimension reduced to 800.
print('Before Vectorization')
print(X_train_essay_tfidf.shape, y_train.shape)
print(X_cv_essay_tfidf.shape, y_cv.shape)
print(X_test_essay_tfidf.shape, y_test.shape)
print('*'*50)
print("After Vectorization")
print(X_train_trunsvd_tfidf.shape, y_train.shape)
print(X_cv_trunsvd_tfidf.shape, y_cv.shape)
print(X_test_trunsvd_tfidf.shape, y_test.shape)
#Citation:
#This code is copied from here: https://stackoverflow.com/a/48803361/4084039
# Tune alpha for a hinge-loss SGD (linear SVM) on the Set-5 (TruncatedSVD) features, L1 penalty.
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
#when loss='hinge' we're performing Linear SVM. (No kernel is used)
model_tfidf_trunsvd = SGDClassifier(loss='hinge', penalty='l1', n_jobs = -1, class_weight='balanced')
param_grid = {
'alpha': np.logspace(-4, 4, 9)
}
grid_tfidf_trunsvd_l1 = GridSearchCV(model_tfidf_trunsvd, param_grid, cv=3, scoring='roc_auc')
grid_tfidf_trunsvd_l1.fit(X_tr_set5, y_train)
# x-axis values: the same alpha grid used in param_grid above
alpha = np.logspace(-4, 4, 9)
# mean/std train and CV AUC per alpha from cv_results_
train_auc = grid_tfidf_trunsvd_l1.cv_results_["mean_train_score"]
train_scores_std = grid_tfidf_trunsvd_l1.cv_results_["std_train_score"]
cv_auc = grid_tfidf_trunsvd_l1.cv_results_["mean_test_score"]
cv_scores_std = grid_tfidf_trunsvd_l1.cv_results_["std_test_score"]
plt.figure()
plt.title('Model')
plt.xlabel('Hyperparameter: Alpha')
plt.ylabel('AUC')
# plot train scores
plt.semilogx(alpha, train_auc, label='Train AUC', color='darkblue')
# create a shaded area between [mean - std, mean + std]
# (the fill_between keyword alpha=0.2 is plot transparency, not the hyperparameter)
plt.gca().fill_between(alpha,
train_auc - train_scores_std,
train_auc + train_scores_std,
alpha=0.2,
color='darkblue')
plt.semilogx(alpha, cv_auc, label='CV AUC', color='darkorange')
# create a shaded area between [mean - std, mean + std]
plt.gca().fill_between(alpha,
cv_auc - cv_scores_std,
cv_auc + cv_scores_std,
alpha=0.2,
color='darkorange')
plt.scatter(alpha, train_auc, label='Train AUC points', color='darkblue')
plt.scatter(alpha, cv_auc, label='CV AUC points', color='darkorange')
#Citation for plotting the legend outside the plot
#url: https://matplotlib.org/users/legend_guide.html
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()
print("Best parameters with L1 Regularization: ", grid_tfidf_trunsvd_l1.best_params_)
print('AUC with the best parameters: ', grid_tfidf_trunsvd_l1.best_score_)
#Citation:
#This code is copied from here: https://stackoverflow.com/a/48803361/4084039
# NOTE(fix): this cell tunes the L2-penalised model; the original comment said
# "With L1 Regularization!!" which contradicted penalty='l2' below.
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
#when loss='hinge' we're performing Linear SVM. (No kernel is used)
model_tfidf_trunsvd_l2 = SGDClassifier(loss='hinge', penalty='l2', n_jobs = -1, class_weight='balanced')
param_grid = {
'alpha': np.logspace(-4, 4, 9)
}
grid_tfidf_trunsvd_l2 = GridSearchCV(model_tfidf_trunsvd_l2, param_grid, cv=3, scoring='roc_auc')
grid_tfidf_trunsvd_l2.fit(X_tr_set5, y_train)
# x-axis values: the same alpha grid used in param_grid above
alpha = np.logspace(-4, 4, 9)
train_auc = grid_tfidf_trunsvd_l2.cv_results_["mean_train_score"]
train_scores_std = grid_tfidf_trunsvd_l2.cv_results_["std_train_score"]
cv_auc = grid_tfidf_trunsvd_l2.cv_results_["mean_test_score"]
cv_scores_std = grid_tfidf_trunsvd_l2.cv_results_["std_test_score"]
plt.figure()
plt.title('Model')
plt.xlabel('Hyperparameter: Alpha')
plt.ylabel('AUC')
# plot train scores
plt.semilogx(alpha, train_auc, label='Train AUC', color='darkblue')
# create a shaded area between [mean - std, mean + std]
plt.gca().fill_between(alpha,
train_auc - train_scores_std,
train_auc + train_scores_std,
alpha=0.2,
color='darkblue')
plt.semilogx(alpha, cv_auc, label='CV AUC', color='darkorange')
# create a shaded area between [mean - std, mean + std]
plt.gca().fill_between(alpha,
cv_auc - cv_scores_std,
cv_auc + cv_scores_std,
alpha=0.2,
color='darkorange')
plt.scatter(alpha, train_auc, label='Train AUC points', color='darkblue')
plt.scatter(alpha, cv_auc, label='CV AUC points', color='darkorange')
#Citation for plotting the legend outside the plot
#url: https://matplotlib.org/users/legend_guide.html
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()
# NOTE(fix): report as L2 — this grid search used penalty='l2' (the original printed "L1").
print("Best parameters with L2 Regularization: ", grid_tfidf_trunsvd_l2.best_params_)
print('AUC with the best parameters: ', grid_tfidf_trunsvd_l2.best_score_)
Analysis:
%%time
#Citation: plot roc auc curve
#url: https://stackabuse.com/understanding-roc-curves-with-python/
# Final model on Set 5 (TruncatedSVD features) with the best settings found above.
best_alpha = 0.0001
best_penalty = 'l1'
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot
model_tfidf_trunsvd_final = SGDClassifier(loss='hinge', alpha=best_alpha, penalty=best_penalty, class_weight='balanced', n_jobs = -1)
#Using CalibratedClassifierCV as SVM doesnt natively support probabilities
# NOTE(review): CalibratedClassifierCV is presumably imported earlier in the
# notebook (from sklearn.calibration); verify.
calibrated_tfidf_set5 = CalibratedClassifierCV(model_tfidf_trunsvd_final, method='sigmoid', cv=5)
calibrated_tfidf_set5.fit(X_tr_set5, y_train)
def plot_roc_curve(test_fpr, test_tpr, train_fpr, train_tpr):
    """Overlay the train and test ROC curves with the y=x chance diagonal."""
    plt.plot(train_fpr, train_tpr, color='red', label='ROC for train')
    plt.plot(test_fpr, test_tpr, color='orange', label='ROC for test')
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    # NOTE(fix): ROC axes are FPR vs TPR, not 'C: Hyperparameter' / 'AUC'.
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend()
    plt.show()
# Positive-class scores: column 1 of predict_proba is P(class == 1).
y_test_pred = calibrated_tfidf_set5.predict_proba(X_te_set5)[:, 1]
y_train_pred = calibrated_tfidf_set5.predict_proba(X_tr_set5)[:, 1]
auc_train = roc_auc_score(y_train, y_train_pred)
print('AUC of Train Data: %.2f' % auc_train)
auc_test = roc_auc_score(y_test, y_test_pred)
# NOTE(fix): use the same 2-decimal format as every other AUC printout (was %.1f).
print('AUC of Test Data: %.2f' % auc_test)
# ROC points and thresholds; the thresholds are reused below by the custom predict().
train_fpr, train_tpr, train_thresholds = roc_curve(y_train, y_train_pred)
test_fpr, test_tpr, test_thresholds = roc_curve(y_test, y_test_pred)
plot_roc_curve(test_fpr, test_tpr, train_fpr, train_tpr)
# Hand-rolled predict using a fixed operating threshold: the ROC threshold
# where tpr*(1-fpr) is largest (high TPR together with low FPR).
def predict(proba, threshould, fpr, tpr):
    """Binarise scores at the threshold where tpr*(1-fpr) peaks."""
    trade_off = tpr * (1 - fpr)
    t = threshould[np.argmax(trade_off)]
    print("the maximum value of tpr*(1-fpr)", max(tpr*(1-fpr)), "for threshold", np.round(t,3))
    labels = []
    for score in proba:
        # 1 when the score reaches the threshold, else 0
        labels.append(1 if score >= t else 0)
    return labels
from sklearn.metrics import confusion_matrix
print("Train confusion matrix")
# NOTE(fix): predict() expects (proba, thresholds, fpr, tpr); the original
# passed train_fpr twice, so the threshold was picked from the wrong arrays.
cm_train = confusion_matrix(y_train, predict(y_train_pred, train_thresholds, train_fpr, train_tpr))
class_names = ['negative', 'positive']
sns.heatmap(cm_train, annot=True, fmt='d', cmap='viridis')
# confusion_matrix puts true labels on the rows -> y-axis is the TRUE label.
plt.ylabel('True label', size=18)
plt.xlabel('Predicted label', size=18)
plt.title("Train Confusion Matrix\n", size=24)
plt.show()
print("Test confusion matrix")
# NOTE(fix): same tpr-for-fpr swap corrected for the test split.
cm_test = confusion_matrix(y_test, predict(y_test_pred, test_thresholds, test_fpr, test_tpr))
class_names = ['negative', 'positive']
sns.heatmap(cm_test, annot=True, fmt='d', cmap='viridis')
plt.ylabel('True label', size=18)
plt.xlabel('Predicted label', size=18)
plt.title("Test Confusion Matrix\n", size=24)
plt.show()
#Citation:
#url: http://zetcode.com/python/prettytable/
# Summary of the best linear-SVM configuration per feature set.
from prettytable import PrettyTable
x = PrettyTable()
x.field_names = ["Index", "Vectorizer", "Model->(Linear SVM)", "Hyper-parameter \u03B1", "Best Regularizer", "AUC"]
x.add_row(["Set 1", "Bag Of Words", "SGD with hinge loss", 0.1, 'L2', 0.72])
x.add_row(["Set 2", "TFIDF", "SGD with hinge loss", 0.01, 'L2', 0.70])
x.add_row(["Set 3", "AVG W2V", "SGD with hinge loss", 0.01, 'L2', 0.70])
# NOTE(fix): Set 4 is the TFIDF-weighted W2V experiment (the "Analysis for
# TFIDF W2V" section above), not a duplicate of the plain TFIDF row.
x.add_row(["Set 4", "TFIDF W2V", "SGD with hinge loss", 0.01, 'L2', 0.70])
x.add_row(["Set 5", "TFIDF on TruncatedSVD(Essay_Text)", "SGD with hinge loss", 0.0001, 'L1', 0.70])
print(x)